experiment_name <- "fveg_1-18_gb"
experiment_path <- here("data", "4_for_analysis", "ML_outputs",  "experiments", experiment_name)
dataset <- "fveg"
months_ts <- TRUE

ET over fallow lands vs predictions

# read in the fallow fields predictions
fallow <- fread(here(experiment_path, "fallow.csv"))
fallow[fallow == -9999] <- NA # this is the na value

if (months_ts){
  # get a nice column of numeric months
  months <- select(fallow, names(fallow)[grepl("month", names(fallow))])
  fallow$month <- names(months)[max.col(months)] 
  fallow$month <- str_extract(fallow$month, '(?<=_)\\d+') %>% as.numeric()
}

A scatterplot with colors by month

if (months_ts){
  # with different months
  r2 <- summary(lm(ET~ET_pred, data=fallow))$r.squared 
  bias <- mean(fallow$ET_pred - fallow$ET, na.rm=TRUE)
  ggplot(fallow) +
    geom_jitter(aes(x=ET_pred, y=ET, color=as.factor(month)), alpha=0.2, size =.1) + 
    theme_classic() + 
    geom_abline(intercept=0,slope=1, color="red") + 
    annotate("text", x=4*30, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(bias, 3))) + 
    xlab("Predicted natural ET (mm/month)") + 
    ylab("Observed fallow ET (mm/month)")
  }

A scatterplot with months averaged out

if (months_ts){
  # averaging over different months
  fallow_year <- fallow %>% group_by(x, y) %>% summarize(ET = mean(ET, na.rm=TRUE), 
                                                         ET_pred = mean(ET_pred, na.rm=TRUE))
} else {
  fallow_year <- fallow
}
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
r2 <- summary(lm(ET~ET_pred, data=fallow_year))$r.squared 
bias <- mean(fallow_year$ET_pred - fallow_year$ET, na.rm=TRUE)
ggplot(fallow_year) +
  geom_jitter(aes(x=ET_pred, y=ET), alpha=0.2, size =.1) + 
  theme_classic() + 
  geom_abline(intercept=0,slope=1, color="red") + 
  annotate("text", x=3*30, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(bias, 3))) + 
  xlab("Predicted natural ET (mm/month)") + 
  ylab("Observed fallow ET (mm/month)")

A time series

if (months_ts){
  # plot out the time series
  fallow_ts <- fallow %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE), 
                                                         ET_pred = mean(ET_pred, na.rm=TRUE))
  
  ggplot(fallow_ts) + 
    geom_line(aes(x=month, y=ET), color="red") + 
    geom_line(aes(x=month, y=ET_pred), color="black") 
}

Here I just got curious about what ag ET vs fallow ET look like. Seems like fallow ET is probably artificially high.

# ag <- fread(here("data", "3_for_counterfactual", "agriculture", "agriculture.csv"))
# ag$month <- str_extract(ag$month, '\\b\\w+$') %>% as.numeric()
# 
# # plot out the time series
# ag_ts <- ag %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE))
# 
# ggplot(ag_ts) + 
#   geom_line(aes(x=month, y=ET), color="red") + 
#   geom_line(data=fallow_ts, aes(x=month, y=ET), color="black")

Spatial CV

# read in the spatial_CV predictions
cv <- fread(here(experiment_path, "crossval_predictions_train.csv"))
cv[cv == -9999] <- NA # this is the na value

if (months_ts){
  # get a nice column of numeric months
  months <- select(cv, names(cv)[grepl("month", names(cv))])
  cv$month <- names(months)[max.col(months)] 
  cv$month <- str_extract(cv$month, '(?<=_)\\d+') %>% as.numeric()
}

cv$mean_dist <- cv$fold_size/6 # the average distance between a held out point and the border to the hold out

map out the performance of the model as hold out sizes increase

if (months_ts){
  gridded_cv_metrics <- cv %>% 
    group_by(x, y, cv_fold, fold_size) %>%
    summarize(ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE)) %>%
    group_by(cv_fold, fold_size) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2), 
              bias = mean(ET_pred - ET, na.rm = TRUE), 
              rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2), 
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE))
} else {
  gridded_cv_metrics <- cv %>%
    group_by(cv_fold, fold_size) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
              bias = mean(ET_pred - ET, na.rm = TRUE),
              rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE),
              ET_pred = mean(ET_pred, na.rm = TRUE))
}
## `summarise()` has grouped output by 'x', 'y', 'cv_fold'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'cv_fold'. You can override using the `.groups` argument.

Turn the cv fold back into something meaningful

# first turn "fold" into x and y coordinates
gridded_cv_metrics$x <- as.numeric(str_extract(gridded_cv_metrics$cv_fold, "[-.0-9]*(?=,)"))
gridded_cv_metrics$y <- as.numeric(str_extract(gridded_cv_metrics$cv_fold, "(?<=,)[-.0-9]*"))
# study area and counties for plotting
study_area <- st_read(study_area_loc) %>% st_transform(st_crs("+proj=longlat +datum=WGS84"))
## Reading layer `study_area' from data source 
##   `/home/waves/users/annaboser/ET_ag_disALEXI/data/1_raw/study_area/study_area.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 3 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -218673.3 ymin: -352229.6 xmax: 135464.2 ymax: 256511.5
## Projected CRS: NAD83 / California Albers
counties <- st_read(counties_loc) %>% filter(STATEFP == "06") %>% st_transform(st_crs("+proj=longlat +datum=WGS84"))
## Reading layer `cb_2018_us_county_500k' from data source 
##   `/home/waves/users/annaboser/ET_ag_disALEXI/data/1_raw/counties/cb_2018_us_county_500k.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 3233 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -179.1489 ymin: -14.5487 xmax: 179.7785 ymax: 71.36516
## Geodetic CRS:  NAD83
filter(gridded_cv_metrics, fold_size !=1) %>%
  ggplot() + 
  geom_sf(data = counties, color=alpha("white",1), size = .2) + 
  geom_raster(aes(x=x, y=y, fill=rmse)) +
  facet_wrap(vars(fold_size/1000)) +
  scale_fill_distiller(palette="Spectral", direction = -1) + #, limits = c(0,2)) +
  geom_sf(data = study_area, fill=alpha("red",0), color = "black", size = .2) + 
  theme_classic() +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(), 
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank())
## Warning: Raster pixels are placed at uneven horizontal intervals and will be
## shifted. Consider using geom_tile() instead.
## Warning: Raster pixels are placed at uneven vertical intervals and will be
## shifted. Consider using geom_tile() instead.

Plot out the performance of the model as hold out sizes get bigger relative to the distance between ag and natural training data

First get the distribution of distances between ag and natural data (specifically the test data)

dist_filename <- here(distance_distribution_path, paste0(dataset, ".csv"))

if (file.exists(dist_filename)){
  library(parallel)
  library(tictoc)
  
  dist <- fread(dist_filename)
} else {
  stop('No distance file for this dataset. Make one using 3_analysis/1_additional_data_manipulation/2_ag_natural_distance.R')
}
## 
## Attaching package: 'tictoc'
## The following object is masked from 'package:data.table':
## 
##     shift
if (months_ts){
  cv_metrics <- cv %>% 
    group_by(x, y, fold_size, mean_dist) %>%
    summarize(ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE)) %>%
    group_by(fold_size, mean_dist) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2), 
              bias = mean(ET_pred - ET, na.rm = TRUE), 
              rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2), 
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE))
} else {
  cv_metrics <- cv %>%
    group_by(fold_size, mean_dist) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2),
              bias = mean(ET_pred - ET, na.rm = TRUE),
              rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2),
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE),
              ET_pred = mean(ET_pred, na.rm = TRUE))
}
## `summarise()` has grouped output by 'x', 'y', 'fold_size'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'fold_size'. You can override using the `.groups` argument.
coef = 2/5

ggplot(NULL) + 
  stat_density(data = dist, aes(x=mindist/1000), fill = "grey10", alpha = .3) + 
  geom_line(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef), color = "red") +
  geom_point(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef), color = "red") +
  xlab(TeX("Distance of agricultural pixel to nearest natural pixel (km) \n Average distance of point to border of hold-out (km)")) + 
  theme_bw() + 
  theme(axis.title.x = element_text(vjust=-2.5)) +
  scale_y_continuous(
    name = "Distribution of agricultural pixels",
    sec.axis = sec_axis(~./coef, name=TeX("Model performance ($R^{2}$)")))

Plot the performance for the different months too

if (months_ts){
  cv_metrics <- cv %>% 
    group_by(fold_size, month, mean_dist) %>%
    summarize(r2 = 1 - mean((ET_pred - ET)^2)/mean((ET - mean(ET))^2), 
              bias = mean(ET_pred - ET, na.rm = TRUE), 
              rmse = sqrt(mean(ET_pred - ET, na.rm = TRUE)^2), 
              mae = mean(abs((ET_pred - ET)), na.rm = TRUE),
              ET = mean(ET, na.rm = TRUE), 
              ET_pred = mean(ET_pred, na.rm = TRUE))
  
  coef = 2/5
  
  ggplot(NULL) + 
    stat_density(data = dist, aes(x=mindist/1000), fill = "grey10", alpha = .3) + 
    geom_line(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef, color = as.factor(month))) +
    geom_point(data = cv_metrics, aes(x=mean_dist/1000, y = r2*coef, color = as.factor(month))) +
    xlab(TeX("Distance of agricultural pixel to nearest natural pixel (km) \n Average distance of point to border of hold-out (km)")) + 
    theme_bw() + 
    theme(axis.title.x = element_text(vjust=-2.5)) +
    scale_y_continuous(
      name = "Distribution of agricultural pixels",
      sec.axis = sec_axis(~./coef, name=TeX("Model performance ($R^{2}$)")))
}
## `summarise()` has grouped output by 'fold_size', 'month'. You can override using
## the `.groups` argument.

Some scatterplots and time series

Make these using the CV with the fold size that is closest to the double of the mean distance between ag and natural (a bit conservative but only slightly)

mean_dist <- mean(dist$mindist)
folds <- unique(cv$mean_dist)
best_fold <- folds[abs(folds-(mean_dist)) == min(abs(folds-(mean_dist)))]
cv_best_fold <- filter(cv, mean_dist == best_fold)
print(cv_best_fold)
##                   x        y Agriculture FVEG CPAD CDL Elevation      Slope
##        1: -122.2117 40.30450          NA    1   NA  NA  120.4524 0.01331826
##        2: -122.2104 40.30450          NA    1   NA   1  121.0478 0.02825651
##        3: -122.2079 40.30450          NA    1   NA   1  120.1848 0.02384662
##        4: -122.2073 40.30450          NA    1   NA   1  119.2702 0.01329535
##        5: -122.2060 40.30450          NA    1   NA   1  119.2642 0.01675222
##       ---                                                                  
## 13568658: -118.9116 34.84324          NA    1   NA   1 1308.4076 0.35483694
## 13568659: -118.9103 34.84324          NA    1   NA  NA 1288.0094 0.06948160
## 13568660: -118.9090 34.84324          NA    1   NA  NA 1293.0698 0.27559921
## 13568661: -118.9084 34.84324          NA    1   NA  NA 1288.1407 0.30591771
## 13568662: -118.9116 34.84261          NA    1   NA   1 1332.6322 0.38480535
##              Aspect       Soil      TWI year        ET       PET month_1
##        1: 6.2714238 0.14174530 18.78722 2021 27.000000 0.9591289       1
##        2: 5.5350285 0.29090971 18.08801 2021 30.000000 0.9596014       1
##        3: 1.0666770 0.15410967 18.03865 2021 29.829855 0.9605466       1
##        4: 1.0363326 0.17099379 18.56828 2021 31.212646 0.9607829       1
##        5: 6.1137919 0.15492515 18.22674 2021 32.490223 0.9612554       1
##       ---                                                               
## 13568658: 0.8541588 0.01784754 15.74251 2021  7.157592 1.0310227       0
## 13568659: 2.7958539 0.07963096 16.72223 2021  6.802347 1.0306276       0
## 13568660: 3.0064828 0.23268874 15.94000 2021  2.549413 1.0302325       0
## 13568661: 2.6084707 0.24122863 15.90047 2021  2.098826 1.0300349       0
## 13568662: 1.1571892 0.06803262 15.57051 2021  4.838519 1.0311517       0
##           month_2 month_3 month_4 month_5 month_6 month_7 month_8 month_9
##        1:       0       0       0       0       0       0       0       0
##        2:       0       0       0       0       0       0       0       0
##        3:       0       0       0       0       0       0       0       0
##        4:       0       0       0       0       0       0       0       0
##        5:       0       0       0       0       0       0       0       0
##       ---                                                                
## 13568658:       0       0       0       0       0       0       0       0
## 13568659:       0       0       0       0       0       0       0       0
## 13568660:       0       0       0       0       0       0       0       0
## 13568661:       0       0       0       0       0       0       0       0
## 13568662:       0       0       0       0       0       0       0       0
##           month_10 month_11 month_12 fold_size
##        1:        0        0        0     10000
##        2:        0        0        0     10000
##        3:        0        0        0     10000
##        4:        0        0        0     10000
##        5:        0        0        0     10000
##       ---                                     
## 13568658:        0        0        1     10000
## 13568659:        0        0        1     10000
## 13568660:        0        0        1     10000
## 13568661:        0        0        1     10000
## 13568662:        0        0        1     10000
##                                         cv_fold   ET_pred month mean_dist
##        1: -122.24719101123596,40.27027027027027 25.264435     1  1666.667
##        2: -122.24719101123596,40.27027027027027 24.291529     1  1666.667
##        3: -122.24719101123596,40.27027027027027 23.829910     1  1666.667
##        4: -122.24719101123596,40.27027027027027 24.255668     1  1666.667
##        5: -122.24719101123596,40.27027027027027 25.264435     1  1666.667
##       ---                                                                
## 13568658: -118.98876404494382,34.77477477477478 21.539060    12  1666.667
## 13568659: -118.98876404494382,34.77477477477478 12.021730    12  1666.667
## 13568660: -118.98876404494382,34.77477477477478  9.390516    12  1666.667
## 13568661: -118.98876404494382,34.77477477477478  9.186157    12  1666.667
## 13568662: -118.98876404494382,34.77477477477478 15.883162    12  1666.667

A scatterplot with colors by month

if (months_ts){
  # with different months
  r2 <- summary(lm(ET~ET_pred, data=cv_best_fold))$r.squared 
  bias <- mean(cv_best_fold$ET_pred - cv_best_fold$ET, na.rm=TRUE)
  ggplot(cv_best_fold) +
    geom_jitter(aes(x=ET_pred, y=ET, color=as.factor(month)), alpha=0.2, size =.1) + 
    theme_classic() + 
    geom_abline(intercept=0,slope=1, color="red") + 
    # annotate("text", x=4.5, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(bias, 3))) + 
    xlab("Predicted natural ET (mm/month)") + 
    ylab("Observed fallow ET (mm/month)")
}

print(paste("R2:", round(r2, 3), "Bias:", round(bias, 3)))
## [1] "R2: 0.642 Bias: 0.403"

A scatterplot with months averaged out

if (months_ts){
  # averaging over different months
  cv_best_fold_year <- cv_best_fold %>% group_by(x, y) %>% summarize(ET = mean(ET, na.rm=TRUE), 
                                                         ET_pred = mean(ET_pred, na.rm=TRUE))
} else {
  cv_best_fold_year <- cv_best_fold
}
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
r2 <- summary(lm(ET~ET_pred, data=cv_best_fold_year))$r.squared 
bias <- mean(cv_best_fold_year$ET_pred - cv_best_fold_year$ET, na.rm=TRUE)
ggplot(cv_best_fold_year) +
  # stat_density_2d(aes(x=ET_pred, y=ET,fill = stat(density)), geom = 'raster', contour = FALSE) + 
  # scale_fill_viridis_c() +
  geom_jitter(aes(x=ET_pred, y=ET), alpha=0.01, size =.01, color = 'black') +
  theme(legend.position="none") + 
  theme_classic() + 
  geom_abline(intercept=0,slope=1, color="red") + 
  # annotate("text", x=4.5*30, y=1.5, label= paste("R2:", round(r2, 3), "Bias:", round(bias, 3))) + 
  xlab("Predicted natural ET (mm/month)") + 
  ylab("Observed fallow ET (mm/month)")

print(paste("R2:", round(r2, 3), "Bias:", round(bias, 3)))
## [1] "R2: 0.651 Bias: 0.403"

A time series

if (months_ts){
  # plot out the time series
  cv_best_fold_ts <- cv_best_fold %>% group_by(month) %>% summarize(ET = mean(ET, na.rm=TRUE), 
                                                         ET_pred = mean(ET_pred, na.rm=TRUE))
  
  ggplot(cv_best_fold_ts) + 
    geom_line(aes(x=month, y=ET), color="red") + 
    geom_line(aes(x=month, y=ET_pred), color="black") + 
    geom_line(data = fallow_ts, aes(x=month, y=ET), color="blue") + 
    geom_line(data = fallow_ts, aes(x=month, y=ET_pred), color="green") 
}